This workflow reanalyze the single nucleus RNA-seq data produced by (Habib et al. 2017), using DroNc-seq: massively parallel sNuc-seq with droplet technology.
We will first load a few libraries. Among them, * DropletUtils provides functions for data from droplet technologies such as 10X Genomics. * biomaRt provides easy access to databases, such as Ensembl, COSMIC, Uniprot, HGNC, etc. * scater is a collection of tools for doing quality control analyses of scRNA-seq * scran provide functions for normalization of cell-specific libary sizes, correcting batch effects, and identification marker genes
bio_packs = c("SingleCellExperiment","DropletUtils","biomaRt",
"scater","scran","limma","org.Hs.eg.db","SC3")
source("https://bioconductor.org/biocLite.R")
for(pack1 in bio_packs){
if( !pack1 %in% installed.packages()[,"Package"]){
biocLite(pack1, suppressUpdates = TRUE)
}
}
cran_packs = c("data.table","svd","Rtsne","stringi","irlba")
for(pack1 in cran_packs){
if( !pack1 %in% installed.packages()[,"Package"]){
install.packages(pack1)
}
}
library(data.table)
library(SingleCellExperiment)
library(DropletUtils)
library(biomaRt)
library(scater)
library(scran)
library(limma)
library(ggplot2)We setup different directories, and set num_cores improve SC3 runtime, if run_sc3 is TRUE. The step of runing SC clustering can take five or more hours without using more cores.
repo_dir = "~/research/GitHub/scRNAseq_pipelines"
work_dir = file.path(repo_dir,"dronc")
dronc_dir = "~/research/scRNAseq/data/GTEx_droncseq_hip_pcf"
# repo_dir = "/pine/scr/p/l/pllittle/CS_eQTL/s3_Real/scRNAseq_pipelines"
# work_dir = file.path(repo_dir,"dronc")
# dronc_dir = file.path(work_dir,"GTEx_droncseq_hip_pcf")
source(file.path(repo_dir,"SOURCE.R"))
run_sc3 = TRUE
num_cores = ifelse(run_sc3,7,1)Next we import in the count data and other available information. The dataset is available here.
counts_fn = file.path(dronc_dir,"GTEx_droncseq_hip_pcf.umi_counts.txt.gz")
counts = fread(cmd=sprintf("zcat < %s",counts_fn),data.table = FALSE)
dim(counts); counts[1:3,1:2]## [1] 32111 14964
## V1 hHP1_AACACTATCTAC
## 1 A1BG 0
## 2 A1BG-AS1 0
## 3 A1CF 0
rownames(counts) = counts$V1
counts = as.matrix(counts[,-1])
colnames(counts)[1:10]## [1] "hHP1_AACACTATCTAC" "hHP1_CTACGCATCCAT" "hHP1_TCGGTACTAATA"
## [4] "hHP1_CCCGCACGCTAT" "hHP1_TCATTTTGTCAT" "hHP1_ACGAGGTCTATG"
## [7] "hHP1_AGTCATGAGGTT" "hHP1_GTTAGTATACCA" "hHP1_GCATTCAGTAAG"
## [10] "hHP1_AGACCGCGACTA"
part_cell_id = sapply(colnames(counts),
function(xx) strsplit(xx,"_")[[1]][1],
USE.NAMES=FALSE)
col_dat = smart_df(sample_name = colnames(counts),
part_cell_id = part_cell_id)
col_dat[1:5,]## sample_name part_cell_id
## 1 hHP1_AACACTATCTAC hHP1
## 2 hHP1_CTACGCATCCAT hHP1
## 3 hHP1_TCGGTACTAATA hHP1
## 4 hHP1_CCCGCACGCTAT hHP1
## 5 hHP1_TCATTTTGTCAT hHP1
Import clustering results by (Habib et al. 2017). We utilize the supplemental files to label clusters and incorporate the existing TSNE results. Refer to the file nmeth.4407-S10.xlsx.
clust_fn = file.path(dronc_dir,"GTEx_droncseq_hip_pcf.clusters.txt.gz")
clust = fread(cmd = sprintf("zcat < %s",clust_fn),data.table = FALSE)
dim(clust); clust[1:5,]## [1] 14963 2
## V1 V2
## 1 hHP1_AACACTATCTAC 4
## 2 hHP1_CTACGCATCCAT 3
## 3 hHP1_TCGGTACTAATA 3
## 4 hHP1_CCCGCACGCTAT 3
## 5 hHP1_TCATTTTGTCAT 3
names(clust) = c("sample_name","Habib_clusters")
clust$Habib_clusters = as.factor(clust$Habib_clusters)
# Double check sample names
all(col_dat$sample_name == clust$sample_name)## [1] TRUE
map_clust_name = smart_df(Habib_clusters = as.factor(seq(19)),
Habib_clusters_name = c("exPFC1","exPFC2","exCA1","exCA3",
"GABA1","GABA2","exDG","ASC1","ASC2","ODC1","ODC2",
"OPC","MG","NSC","END",rep(NA,4)),
Habib_cell_type = c("exPFC","exPFC","exCA","exCA",
"GABA","GABA","exDG","ASC","ASC","ODC","ODC","OPC",
"MG","NSC","END",rep(NA,4)))
clust = smart_merge(clust,map_clust_name)
clust = clust[match(col_dat$sample_name,clust$sample_name),]
clust[1:5,]## Habib_clusters sample_name Habib_clusters_name Habib_cell_type
## 9142 4 hHP1_AACACTATCTAC exCA3 exCA
## 8826 3 hHP1_CTACGCATCCAT exCA1 exCA
## 8732 3 hHP1_TCGGTACTAATA exCA1 exCA
## 8735 3 hHP1_CCCGCACGCTAT exCA1 exCA
## 8731 3 hHP1_TCATTTTGTCAT exCA1 exCA
We marge the TSNE results from (Habib et al. 2017) into the sce object. We also check for spike-ins, and as expected, there is no spike-ins in this dataset.
tsne_fn = file.path(dronc_dir,"GTEx_droncseq_hip_pcf.tsne.txt.gz")
df_tsne = fread(cmd = sprintf("zcat < %s",tsne_fn),data.table = FALSE)
dim(df_tsne); df_tsne[1:5,]## [1] 14963 3
## V1 tSNE_1 tSNE_2
## 1 hHP1_AACACTATCTAC -8.436537 7.721106
## 2 hHP1_CTACGCATCCAT -6.606278 8.099538
## 3 hHP1_TCGGTACTAATA -6.724967 8.011943
## 4 hHP1_CCCGCACGCTAT -6.286364 7.892359
## 5 hHP1_TCATTTTGTCAT -10.455725 2.374719
names(df_tsne) = c("sample_name",paste0("Habib_TSNE",1:2))
table(df_tsne$sample_name == clust$sample_name)##
## TRUE
## 14963
table(df_tsne$sample_name == clust$sample_name)##
## TRUE
## 14963
is_match = (col_dat$sample_name == df_tsne$sample_name) &
(df_tsne$sample_name == clust$sample_name); table(is_match)## is_match
## TRUE
## 14963
col_dat = cbind(col_dat,
clust[,names(clust) != "sample_name"],
df_tsne[,names(df_tsne) != "sample_name"])
col_dat[1:5,]## sample_name part_cell_id Habib_clusters Habib_clusters_name
## 9142 hHP1_AACACTATCTAC hHP1 4 exCA3
## 8826 hHP1_CTACGCATCCAT hHP1 3 exCA1
## 8732 hHP1_TCGGTACTAATA hHP1 3 exCA1
## 8735 hHP1_CCCGCACGCTAT hHP1 3 exCA1
## 8731 hHP1_TCATTTTGTCAT hHP1 3 exCA1
## Habib_cell_type Habib_TSNE1 Habib_TSNE2
## 9142 exCA -8.436537 7.721106
## 8826 exCA -6.606278 8.099538
## 8732 exCA -6.724967 8.011943
## 8735 exCA -6.286364 7.892359
## 8731 exCA -10.455725 2.374719
sce = SingleCellExperiment(assays = list(counts = counts),colData = col_dat)
sce## class: SingleCellExperiment
## dim: 32111 14963
## metadata(0):
## assays(1): counts
## rownames(32111): A1BG A1BG-AS1 ... ZZEF1 ZZZ3
## rowData names(0):
## colnames(14963): hHP1_AACACTATCTAC hHP1_CTACGCATCCAT ...
## PFC-CD_CTCCATTCATGC PFC-CD_CGTCATTAGCAG
## colData names(7): sample_name part_cell_id ... Habib_TSNE1
## Habib_TSNE2
## reducedDimNames(0):
## spikeNames(0):
rownames(sce)[grep("^ERCC",rownames(sce))]## [1] "ERCC1" "ERCC2" "ERCC3" "ERCC4" "ERCC5" "ERCC6" "ERCC6L"
## [8] "ERCC6L2" "ERCC8"
We will extract annotation information based on gene names. Since the existing count data from (Habib et al. 2017) were based on mapping to hg19, we use this version of reference genome.
anno_file = file.path(work_dir,"gene.annotation_dronc.rds")
if( file.exists(anno_file) ){
gene_anno = readRDS(anno_file)
} else{
ensembl = useEnsembl(biomart="ensembl",GRCh=37,
dataset="hsapiens_gene_ensembl")
attr_string = c("hgnc_symbol","external_gene_name","chromosome_name",
"start_position","end_position","strand","description",
"percentage_gene_gc_content","gene_biotype")
gene_anno = getBM(attributes = attr_string,
filters = "external_gene_name",
values = rownames(sce),
mart = ensembl)
saveRDS(gene_anno, file=anno_file)
}
dim(sce); dim(gene_anno)## [1] 32111 14963
## [1] 34526 9
We remove those genes that are part of extracted annotation, but aren’t in sce.
w2rm = which(!gene_anno$external_gene_name %in% rownames(sce))
w2rm## [1] 4232 4645
gene_anno[w2rm,]## hgnc_symbol external_gene_name chromosome_name start_position
## 4232 C10ORF68 10 32832297
## 4645 C1ORF220 1 178511950
## end_position strand
## 4232 33171802 1
## 4645 178517579 1
## description
## 4232 Homo sapiens coiled-coil domain containing 7 (CCDC7), transcript variant 5, mRNA. [Source:RefSeq mRNA;Acc:NM_024688]
## 4645
## percentage_gene_gc_content gene_biotype
## 4232 37.23 protein_coding
## 4645 49.91 protein_coding
gene_anno = gene_anno[-w2rm,]
dim(sce); dim(gene_anno)## [1] 32111 14963
## [1] 34524 9
Many genes have multiple entries in the annotation, often because they are annotated to scaffolds, assembly patches and alternate loci. Here we simply remove such entries. The we remove duplicated annotations and genes without annotations.
table(gene_anno$chromosome_name)[1:30]##
## 1 10 11 12 13 14
## 3084 1252 1741 1694 637 1145
## 15 16 17 18 19 2
## 1169 1426 1820 633 1955 2186
## 20 21 22 3 4 5
## 800 383 704 1784 1301 1567
## 6 7 8 9 GL000192.1 GL000193.1
## 1622 1446 1296 1207 2 1
## GL000194.1 GL000195.1 GL000199.1 GL000204.1 GL000205.1 GL000213.1
## 2 1 1 1 2 1
chr_nms = c(1:22,"X","Y","MT")
gene_anno = gene_anno[which(gene_anno$chromosome_name %in% chr_nms),]
dim(sce); dim(gene_anno)## [1] 32111 14963
## [1] 31978 9
t1 = table(gene_anno$external_gene_name)
t2 = sort(t1[t1 > 1], decreasing=TRUE)
length(t2)## [1] 40
t2[1:10]##
## SNORD113 MIR1302-2 NPIPA7 CCDC177 CDRT1
## 6 4 3 2 2
## CEBPA-AS1 FAM226B FAM47E-STBD1 GATS GOLGA7B
## 2 2 2 2 2
gene_anno[which(gene_anno$external_gene_name %in% names(t2)[1:4]), 1:4]## hgnc_symbol external_gene_name chromosome_name start_position
## 5013 CCDC177 CCDC177 14 70037483
## 5014 CCDC177 14 70038216
## 14832 MIR1302-11 MIR1302-2 19 71161
## 14833 MIR1302-10 MIR1302-2 19 71161
## 14834 MIR1302-9 MIR1302-2 19 71161
## 14835 MIR1302-2 MIR1302-2 19 71161
## 16502 NPIPA7 16 16411301
## 16503 NPIPA7 NPIPA7 16 16472912
## 16505 NPIPA7 16 18451943
## 29789 SNORD113 14 101422577
## 29810 SNORD113 14 101443726
## 29811 SNORD113 14 101445339
## 29812 SNORD113 14 101446329
## 29825 SNORD113 14 101460594
## 29826 SNORD113 14 101464804
w_duplicate = which(gene_anno$external_gene_name %in% names(t2))
ganno2 = gene_anno[w_duplicate,]
dim(ganno2)## [1] 87 9
table(ganno2$hgnc_symbol == ganno2$external_gene_name)##
## FALSE TRUE
## 46 41
ganno2 = ganno2[which(ganno2$hgnc_symbol == ganno2$external_gene_name),]
dim(ganno2)## [1] 41 9
ganno2 = dplyr::distinct(ganno2,external_gene_name,.keep_all = TRUE)
dim(ganno2)## [1] 39 9
gene_anno = rbind(gene_anno[-w_duplicate,], ganno2)
dim(gene_anno)## [1] 31930 9
table(gene_anno$gene_biotype)##
## 3prime_overlapping_ncrna antisense IG_C_gene
## 11 4152 3
## IG_V_gene lincRNA miRNA
## 6 4807 966
## misc_RNA Mt_rRNA Mt_tRNA
## 935 2 18
## processed_transcript protein_coding pseudogene
## 394 18296 3
## rRNA sense_intronic sense_overlapping
## 204 648 172
## snoRNA snRNA TR_C_gene
## 201 1083 5
## TR_J_gene TR_V_gene
## 7 17
gene_missing = setdiff(rownames(sce),gene_anno$external_gene_name)
gene_missing[1:10]## [1] "AC005152.2" "AC006132.1" "AC006445.8" "AC007092.1" "AC007390.5"
## [6] "AC007464.1" "AC009232.2" "AC009236.1" "AC010127.5" "AC011043.1"
length(gene_missing)## [1] 181
w2kp = match(gene_anno$external_gene_name,rownames(sce))
table(is.na(w2kp))##
## FALSE
## 31930
gene_anno$external_gene_name[which(is.na(w2kp))]## character(0)
sce = sce[w2kp,]
dim(sce)## [1] 31930 14963
rowData(sce) = gene_anno
sce## class: SingleCellExperiment
## dim: 31930 14963
## metadata(0):
## assays(1): counts
## rownames(31930): AC006946.16 AC006946.17 ... ZNF8 ZNF788
## rowData names(9): hgnc_symbol external_gene_name ...
## percentage_gene_gc_content gene_biotype
## colnames(14963): hHP1_AACACTATCTAC hHP1_CTACGCATCCAT ...
## PFC-CD_CTCCATTCATGC PFC-CD_CGTCATTAGCAG
## colData names(7): sample_name part_cell_id ... Habib_TSNE1
## Habib_TSNE2
## reducedDimNames(0):
## spikeNames(0):
rowData(sce)[1:5,]## DataFrame with 5 rows and 9 columns
## hgnc_symbol external_gene_name chromosome_name start_position
## <character> <character> <character> <integer>
## 1 AC006946.16 22 17548711
## 2 AC006946.17 22 17561591
## 3 AC004019.13 22 18062923
## 4 ABHD3 ABHD3 18 19230858
## 5 AC004471.10 22 19111822
## end_position strand
## <integer> <integer>
## 1 17551565 1
## 2 17562346 1
## 3 18071958 -1
## 4 19284766 -1
## 5 19115962 -1
## description
## <character>
## 1
## 2
## 3
## 4 abhydrolase domain containing 3 [Source:HGNC Symbol;Acc:18718]
## 5
## percentage_gene_gc_content gene_biotype
## <numeric> <character>
## 1 41.61 lincRNA
## 2 41.67 lincRNA
## 3 52.6 antisense
## 4 42.45 protein_coding
## 5 58.3 lincRNA
Please refer to this workflow in bioconductor for reference.
bcrank = barcodeRanks(counts(sce))
# Only show unique points for plotting speed.
uniq = !duplicated(bcrank$rank)
par(mar=c(5,4,2,1), bty="n")
plot(bcrank$rank[uniq], bcrank$total[uniq], log="xy",
xlab="Rank", ylab="Total UMI count", cex=0.5, cex.lab=1.2)
abline(h=bcrank$inflection, col="darkgreen", lty=2,lwd=2)
abline(h=bcrank$knee, col="dodgerblue", lty=2,lwd=2)
legend("left", legend=c("Inflection", "Knee"), bty="n",
col=c("darkgreen", "dodgerblue"), lty=2, cex=1.2,lwd=2)bcrank$inflection## hCf_TAATAGCGGTAC
## 221
bcrank$knee## PFC2-A2_CACAGCGCTGTC
## 580
summary(bcrank$total)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 218.0 459.0 716.0 948.9 1167.0 9423.0
table(bcrank$total >= bcrank$knee)##
## FALSE TRUE
## 5513 9450
table(bcrank$total >= bcrank$inflection)##
## FALSE TRUE
## 1 14962
set.seed(100)
date()## [1] "Wed Jan 30 11:52:07 2019"
e_out = emptyDrops(counts(sce))
date()## [1] "Wed Jan 30 11:52:28 2019"
e_out## DataFrame with 14963 rows and 5 columns
## Total LogProb PValue Limited FDR
## <integer> <numeric> <numeric> <logical> <numeric>
## hHP1_AACACTATCTAC 4484 NaN 1 FALSE 0
## hHP1_CTACGCATCCAT 4919 NaN 1 FALSE 0
## hHP1_TCGGTACTAATA 5163 NaN 1 FALSE 0
## hHP1_CCCGCACGCTAT 5030 NaN 1 FALSE 0
## hHP1_TCATTTTGTCAT 3588 NaN 1 FALSE 0
## ... ... ... ... ... ...
## PFC-CD_TTGCCTGGCGGG 590 NaN 1 FALSE 0
## PFC-CD_CACGCTCCCCTA 269 NaN 1 FALSE 1
## PFC-CD_GCTCTACAACCG 522 NaN 1 FALSE 1
## PFC-CD_CTCCATTCATGC 497 NaN 1 FALSE 1
## PFC-CD_CGTCATTAGCAG 568 NaN 1 FALSE 1
length(unique(e_out$FDR))## [1] 2
table(e_out$FDR)##
## 0 1
## 9450 5513
tapply(e_out$Total, e_out$FDR, summary)## $`0`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 581 748 1011 1268 1487 9423
##
## $`1`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 218.0 317.0 391.0 402.3 489.0 580.0
From the above analysis, some cells with very small number of UMIs. Here we chose do not remove any cells because it appears all these 14,963 cells were used in the main anlaysis by (Habib et al. 2017). Based on Figure 2 of their paper, there are
14,963 DroNc-seq nuclei profiles (each with >10,000 reads and >200 genes)
We will generate a set of QC features per cell, including the expression of mitochondira/ribosomal genes. We identify ribosomal genes based on annoation from https://www.genenames.org/.
ribo_fn = file.path(work_dir,"ribosome_genes.txt")
ribo = smart_RT(ribo_fn,sep='\t',header=TRUE)
ribo[1:2,]## HGNC.ID Approved.Symbol Approved.Name Status Previous.Symbols
## 1 10298 RPL10 ribosomal protein L10 Approved
## 2 10299 RPL10A ribosomal protein L10a Approved NEDD6
## Synonyms Chromosome Accession.Numbers
## 1 NOV, QM, DXS648E, DXS648, FLJ23544, L10 Xq28 AB007170
## 2 Csa-19, L10A 6p21.31 U12404
## RefSeq.IDs Gene.Family.Tag Gene.family.description Gene.family.ID
## 1 NM_006013 RPL L ribosomal proteins 729
## 2 NM_007104 RPL L ribosomal proteins 729
is_mito = which(rowData(sce)$chromosome_name == "MT")
is_ribo = which(rowData(sce)$external_gene_name %in% ribo$Approved.Symbol)
length(is_mito)## [1] 33
length(is_ribo)## [1] 160
sce = calculateQCMetrics(sce, feature_controls=list(Mt=is_mito, Ri=is_ribo))
sort(colnames(colData(sce)))## [1] "Habib_cell_type"
## [2] "Habib_clusters"
## [3] "Habib_clusters_name"
## [4] "Habib_TSNE1"
## [5] "Habib_TSNE2"
## [6] "is_cell_control"
## [7] "log10_total_counts"
## [8] "log10_total_counts_endogenous"
## [9] "log10_total_counts_feature_control"
## [10] "log10_total_counts_Mt"
## [11] "log10_total_counts_Ri"
## [12] "log10_total_features"
## [13] "log10_total_features_by_counts"
## [14] "log10_total_features_by_counts_endogenous"
## [15] "log10_total_features_by_counts_feature_control"
## [16] "log10_total_features_by_counts_Mt"
## [17] "log10_total_features_by_counts_Ri"
## [18] "log10_total_features_endogenous"
## [19] "log10_total_features_feature_control"
## [20] "log10_total_features_Mt"
## [21] "log10_total_features_Ri"
## [22] "part_cell_id"
## [23] "pct_counts_endogenous"
## [24] "pct_counts_feature_control"
## [25] "pct_counts_in_top_100_features"
## [26] "pct_counts_in_top_100_features_endogenous"
## [27] "pct_counts_in_top_100_features_feature_control"
## [28] "pct_counts_in_top_100_features_Ri"
## [29] "pct_counts_in_top_200_features"
## [30] "pct_counts_in_top_200_features_endogenous"
## [31] "pct_counts_in_top_50_features"
## [32] "pct_counts_in_top_50_features_endogenous"
## [33] "pct_counts_in_top_50_features_feature_control"
## [34] "pct_counts_in_top_50_features_Ri"
## [35] "pct_counts_in_top_500_features"
## [36] "pct_counts_in_top_500_features_endogenous"
## [37] "pct_counts_Mt"
## [38] "pct_counts_Ri"
## [39] "pct_counts_top_100_features"
## [40] "pct_counts_top_100_features_endogenous"
## [41] "pct_counts_top_100_features_feature_control"
## [42] "pct_counts_top_100_features_Ri"
## [43] "pct_counts_top_200_features"
## [44] "pct_counts_top_200_features_endogenous"
## [45] "pct_counts_top_50_features"
## [46] "pct_counts_top_50_features_endogenous"
## [47] "pct_counts_top_50_features_feature_control"
## [48] "pct_counts_top_50_features_Ri"
## [49] "pct_counts_top_500_features"
## [50] "pct_counts_top_500_features_endogenous"
## [51] "sample_name"
## [52] "total_counts"
## [53] "total_counts_endogenous"
## [54] "total_counts_feature_control"
## [55] "total_counts_Mt"
## [56] "total_counts_Ri"
## [57] "total_features"
## [58] "total_features_by_counts"
## [59] "total_features_by_counts_endogenous"
## [60] "total_features_by_counts_feature_control"
## [61] "total_features_by_counts_Mt"
## [62] "total_features_by_counts_Ri"
## [63] "total_features_endogenous"
## [64] "total_features_feature_control"
## [65] "total_features_Mt"
## [66] "total_features_Ri"
par(mfrow=c(2,2), mar=c(5, 4, 1, 1), bty="n")
hist(log10(sce$total_counts), xlab="log10(Library sizes)", main="",
breaks=20, col="grey80", ylab="Number of cells")
hist(log10(sce$total_features), xlab="log10(# of expressed genes)",
main="", breaks=20, col="grey80", ylab="Number of cells")
hist(sce$pct_counts_Ri, xlab="Ribosome prop. (%)",
ylab="Number of cells", breaks=40, main="", col="grey80")
hist(sce$pct_counts_Mt, xlab="Mitochondrial prop. (%)",
ylab="Number of cells", breaks=80, main="", col="grey80")par(mfrow=c(2,2), mar=c(5, 4, 1, 1), bty="n")
smoothScatter(log10(sce$total_counts), log10(sce$total_features),
xlab="log10(Library sizes)", ylab="log10(# of expressed genes)")
smoothScatter(log10(sce$total_counts), sce$pct_counts_Ri,
xlab="log10(Library sizes)", ylab="Ribosome prop. (%)")
smoothScatter(log10(sce$total_counts), sce$pct_counts_Mt,
xlab="log10(Library sizes)", ylab="Mitochondrial prop. (%)")
smoothScatter(sce$pct_counts_Ri,sce$pct_counts_Mt,
xlab="Ribosome prop. (%)", ylab="Mitochondrial prop. (%)")From the QC results, we will filter on the metrics by identifying outliers using isOutlier. Note that the cells assigned to cluster 18 by (Habib et al. 2017) will all be excluded.
libsize_drop = isOutlier(sce$total_counts,nmads = 3,type = "lower",log = TRUE)
feature_drop = isOutlier(sce$total_features_by_counts,nmads = 3,
type = "lower",log = TRUE)
mito_drop = isOutlier(sce$pct_counts_Mt,nmads = 3,type = "higher")
ribo_drop = isOutlier(sce$pct_counts_Ri,nmads = 3,type = "higher")
keep = !(libsize_drop | feature_drop | mito_drop | ribo_drop)
smart_df(ByLibSize = sum(libsize_drop),ByFeature = sum(feature_drop),
ByMito = sum(mito_drop),ByRibo = sum(ribo_drop),
Remaining = sum(keep))## ByLibSize ByFeature ByMito ByRibo Remaining
## 1 0 0 1193 735 13181
smart_table(colData(sce)$Habib_clusters,keep)## keep
## FALSE TRUE
## 1 56 3048
## 2 4 293
## 3 31 390
## 4 8 737
## 5 18 874
## 6 51 772
## 7 32 1421
## 8 126 1078
## 9 124 581
## 10 234 1484
## 11 92 1155
## 12 106 578
## 13 108 281
## 14 40 161
## 15 279 120
## 16 171 83
## 17 126 74
## 18 132 0
## 19 44 51
smart_table(colData(sce)$Habib_clusters_name,keep)## keep
## FALSE TRUE
## ASC1 126 1078
## ASC2 124 581
## END 279 120
## exCA1 31 390
## exCA3 8 737
## exDG 32 1421
## exPFC1 56 3048
## exPFC2 4 293
## GABA1 18 874
## GABA2 51 772
## MG 108 281
## NSC 40 161
## ODC1 234 1484
## ODC2 92 1155
## OPC 106 578
## <NA> 473 208
sce = sce[,keep]
dim(sce)## [1] 31930 13181
rowData(sce)[1:2,]## DataFrame with 2 rows and 20 columns
## hgnc_symbol external_gene_name chromosome_name start_position
## <character> <character> <character> <integer>
## 1 AC006946.16 22 17548711
## 2 AC006946.17 22 17561591
## end_position strand description percentage_gene_gc_content
## <integer> <integer> <character> <numeric>
## 1 17551565 1 41.61
## 2 17562346 1 41.67
## gene_biotype is_feature_control is_feature_control_Mt
## <character> <logical> <logical>
## 1 lincRNA FALSE FALSE
## 2 lincRNA FALSE FALSE
## is_feature_control_Ri mean_counts log10_mean_counts
## <logical> <numeric> <numeric>
## 1 FALSE 0.00548018445498897 0.0023735161394708
## 2 FALSE 0.00360890195816347 0.00156450482886486
## n_cells_by_counts pct_dropout_by_counts total_counts log10_total_counts
## <integer> <numeric> <integer> <numeric>
## 1 79 99.4720310098242 82 1.91907809237607
## 2 52 99.6524761077324 54 1.74036268949424
## n_cells_counts pct_dropout_counts
## <integer> <numeric>
## 1 79 99.4720310098242
## 2 52 99.6524761077324
summary(rowData(sce)$mean_counts)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00007 0.00033 0.00287 0.02972 0.02484 107.63510
summary(rowData(sce)$mean_counts[rowData(sce)$mean_counts>0])## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00007 0.00033 0.00287 0.02972 0.02484 107.63510
summary(rowData(sce)$n_cells_counts)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.0 5.0 40.5 307.8 341.0 13416.0
par(mfrow=c(1,3), mar=c(5,4,1,1))
hist(log10(rowData(sce)$mean_counts+1e-6), col="grey80", main="",
breaks=40, xlab="log10(ave # of UMI + 1e-6)")
hist(log10(rowData(sce)$n_cells_counts+1), col="grey80", main="",
breaks=40, xlab="log10(# of expressed cells + 1)")
plot(log10(rowData(sce)$mean_counts+1e-6), pch=16, col=rgb(0,0,0,0.4),
log10(rowData(sce)$n_cells_counts + 1),
xlab="log10(ave # of UMI + 1e-6)",
ylab="log10(# of expressed cells + 1)")tb1 = table(rowData(sce)$n_cells_counts)
tb1[1:11]##
## 1 2 3 4 5 6 7 8 9 10 11
## 3302 1958 1280 1029 843 667 552 525 473 383 369
We remove those genes that are lowly expressed. (Habib et al. 2017) mentioned in section “Gene detection and quality controls” of Online Methods
Nuclei with less than 200 detected genes and less than 10,000 usable reads were filtered out.
and
A gene is considered detected in a cell if it has at least two unique UMIs (transcripts) associated with it. For each analysis, genes were removed that were detected in less than 10 nuclei.
Therefore it seems we should remove all the nuclei. having less than 200 genes with two or more UMI counts. However, this would remove the majority of the nuclei. Therefore, we conlcude that they meant to remove the nuclei. having less than 200 genes with one or more UMI counts. To filter genes, we follow their threshold to remove genes with two or more UMIs in less than 10 nuclei.
Note that the variable strand need to be renamed, otherwise there is an error message that such a variabel name cannot be used.
names(rowData(sce))[names(rowData(sce)) == "strand"] = "strand_n"
n_genes = colSums(counts(sce) >= 2)
summary(n_genes)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 10.0 43.0 71.0 112.5 133.0 1555.0
table(n_genes >= 100)##
## FALSE TRUE
## 8440 4741
table(n_genes >= 200)##
## FALSE TRUE
## 11339 1842
n_genes = colSums(counts(sce) >= 1)
summary(n_genes)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 198.0 351.0 540.0 670.5 845.0 4233.0
table(n_genes >= 100)##
## TRUE
## 13181
table(n_genes >= 200)##
## FALSE TRUE
## 2 13179
n_cells = rowSums(counts(sce) >= 2)
summary(n_cells)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 0.00 1.00 46.45 22.00 11725.00
table(n_cells >= 10)##
## FALSE TRUE
## 21323 10607
sce = sce[which(n_cells >= 10),]
dim(sce)## [1] 10607 13181
Next we check those highly expressed genes
par(mar=c(5,4,1,1))
od1 = order(rowData(sce)$mean_counts, decreasing = TRUE)
barplot(rowData(sce)$mean_counts[od1[20:1]], las=1,
names.arg=rowData(sce)$hgnc_symbol[od1[20:1]],
horiz=TRUE, cex.names=0.8, cex.axis=0.8,
xlab="ave # of UMI")A simple solution for normalization and stablizing expression varaince across genes is to tranform the count data by log(count/size.factor + 1). One may calcualte size.factor per cell as the total number of UMIs, and this assumes the total expression are the same across all the cells. However, the total expression of each cell may vary with respect to cell type and/or cell size, and the computeSumFactors function in R package scran provides a more sophisicated way to calcualte size.factor to allow such variaation across cells (Lun, Bach, and Marioni 2016). computeSumFactors can use initial clustering of cells to normalize expression within and beetween clusters. Within a cluster, it estimates the size factor for many groups of cells so that there are more groups than cells, and then it can calcualte the size factor per cell using a lienar deconvolution system. We remove all the cells with negative or very small size factors (< 0.01).
As shown in the following plot, the final size factor estimation is indeed highly correlated with the naive definition by total count.
Finally, the command normalize(sce) adds the normalized expression into the variable sce, which can be accessed by logcounts(sce) = log2(gene_cell_count / size_factor + 1).
date()## [1] "Wed Jan 30 11:53:13 2019"
clusters = quickCluster(sce, min.mean=0.1, method="igraph")
table(clusters)## clusters
## 1 2 3 4 5 6
## 503 1590 1591 2864 2062 4571
date()## [1] "Wed Jan 30 11:53:57 2019"
sce = computeSumFactors(sce, cluster=clusters, min.mean=0.1)
date()## [1] "Wed Jan 30 11:55:52 2019"
summary(sizeFactors(sce))## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0927 0.4487 0.7515 1.0000 1.2610 9.7585
sort(sizeFactors(sce))[1:30]## [1] 0.09269791 0.10089499 0.10199474 0.11052233 0.11186730 0.11451162
## [7] 0.11579893 0.11606944 0.11686686 0.11799359 0.12004139 0.12039394
## [13] 0.12144386 0.12297642 0.12451394 0.12485572 0.12649517 0.12703992
## [19] 0.13203308 0.13261365 0.13403081 0.13450431 0.13482033 0.13840999
## [25] 0.13862154 0.13927866 0.14119765 0.14164402 0.14274874 0.14293551
dim(sce)## [1] 10607 13181
sce = sce[,which(sizeFactors(sce) > 0.01)]
dim(sce)## [1] 10607 13181
par(mfrow=c(1,2), mar=c(5,4,2,1), bty="n")
smoothScatter(sce$total_counts, sizeFactors(sce), log="xy",
xlab="total counts", ylab="size factors")
plot(sce$total_counts, sizeFactors(sce), log="xy",
xlab="total counts", ylab="size factors",
cex=0.3, pch=20, col=rgb(0.1,0.2,0.7,0.3))
abline(h=0.05)dim(sce)## [1] 10607 13181
sce = sce[,which(sizeFactors(sce) > 0.05)]
dim(sce)## [1] 10607 13181
sce = normalize(sce) For dimension reduction, such as calculating PCA or performing TSNE, we should start by identifying a subset of genes with high level of biological signal relative to background (technical) noise. These genes are referred to as HVGs (highly variable genes). The decomposeVar function from R/cran is designed for this task.
new_trend = makeTechTrend(x=sce)
fit = trendVar(sce, use.spikes=FALSE, loess.args=list(span=0.05))
par(mfrow=c(1,1), mar=c(5,4,2,1), bty="n")
plot(fit$mean, fit$var, pch=20, col=rgb(0.1,0.2,0.7,0.6),
xlab="log(mean)", ylab="var")
curve(fit$trend(x), col="orange", lwd=2, add=TRUE)
curve(new_trend(x), col="red", lwd=2, add=TRUE)
legend("top", legend=c("Poisson noise", "observed trend"),
lty=1, lwd=2, col=c("red", "orange"), bty="n")fit$trend = new_trend
dec = decomposeVar(fit=fit)
top_dec = dec[order(dec$bio, decreasing=TRUE),]
plotExpression(sce, features=rownames(top_dec)[1:10])When performing PCA, we can use all the genes or just those genes with high signal-to-noise ratio (HGVs). TSNE analysis is usually based on the top PCs rather than the original gene expression data. We first perform PCA using all the genes and the function denoisePCA can automatically select the PCs based on modeling of technical noise.
date()## [1] "Wed Jan 30 11:56:29 2019"
sce = denoisePCA(sce,technical=new_trend,approx=TRUE) # Using the Poisson trend fit
date()## [1] "Wed Jan 30 11:58:11 2019"
dim(reducedDim(sce,"PCA"))## [1] 13181 94
par(mfrow=c(1,1))
plot(log10(attr(reducedDim(sce), "percentVar")), xlab="PC",
ylab="log10(Prop of variance explained)", pch=20, cex=0.6,
col=rgb(0.8, 0.2, 0.2, 0.5))
abline(v=ncol(reducedDim(sce,"PCA")), lty=2, col="red")df_redDim = smart_df(colData(sce)[,c("sample_name","part_cell_id",
paste0("Habib_TSNE",1:2),"log10_total_features","Habib_clusters",
"Habib_clusters_name","Habib_cell_type")],
reducedDim(sce, "PCA")[,1:2])
rownames(df_redDim) = NULL
ggplot_custom(DATA = df_redDim,X = "Habib_TSNE1",Y = "Habib_TSNE2",
COL = "part_cell_id",TYPE = "cat")ggplot_custom(DATA = df_redDim,X = "Habib_TSNE1",Y = "Habib_TSNE2",
COL = "log10_total_features",TYPE = "cont")ggplot_custom(DATA = df_redDim,X = "Habib_TSNE1",Y = "Habib_TSNE2",
COL = "Habib_clusters",TYPE = "cat")ggplot_custom(DATA = df_redDim,X = "Habib_TSNE1",Y = "Habib_TSNE2",
COL = "Habib_clusters_name",TYPE = "cat")ggplot_custom(DATA = df_redDim,X = "Habib_TSNE1",Y = "Habib_TSNE2",
COL = "Habib_cell_type",TYPE = "cat")date()## [1] "Wed Jan 30 11:58:16 2019"
sce = runTSNE(sce,use_dimred="PCA",perplexity=30,rand_seed=100)
date()## [1] "Wed Jan 30 12:00:55 2019"
tmp_df = smart_df(reducedDim(sce,"TSNE"))
rownames(tmp_df) = NULL
names(tmp_df) = paste0("my_TSNE",1:2)
df_redDim = smart_df(df_redDim,tmp_df); rm(tmp_df)
df_redDim[1:5,]## sample_name part_cell_id Habib_TSNE1 Habib_TSNE2
## 1 hHP1_AACACTATCTAC hHP1 -8.436537 7.721106
## 2 hHP1_CTACGCATCCAT hHP1 -6.606278 8.099538
## 3 hHP1_TCGGTACTAATA hHP1 -6.724967 8.011943
## 4 hHP1_CCCGCACGCTAT hHP1 -6.286364 7.892359
## 5 hHP1_TCATTTTGTCAT hHP1 -10.455725 2.374719
## log10_total_features Habib_clusters Habib_clusters_name Habib_cell_type
## 1 3.393048 4 exCA3 exCA
## 2 3.453624 3 exCA1 exCA
## 3 3.458638 3 exCA1 exCA
## 4 3.419129 3 exCA1 exCA
## 5 3.345178 3 exCA1 exCA
## PC1 PC2 my_TSNE1 my_TSNE2
## 1 3.326051 -1.240640 20.77887 11.296457
## 2 3.747268 -1.075909 20.85935 8.749623
## 3 4.543743 -1.682305 22.32118 6.832821
## 4 4.255275 -1.072497 22.64867 7.073299
## 5 3.136629 -1.083864 20.22245 4.447358
ggplot_custom(DATA = df_redDim,X = "my_TSNE1",Y = "my_TSNE2",
COL = "part_cell_id",TYPE = "cat")ggplot_custom(DATA = df_redDim,X = "my_TSNE1",Y = "my_TSNE2",
COL = "log10_total_features",TYPE = "cont")ggplot_custom(DATA = df_redDim,X = "my_TSNE1",Y = "my_TSNE2",
COL = "Habib_clusters",TYPE = "cat")ggplot_custom(DATA = df_redDim,X = "my_TSNE1",Y = "my_TSNE2",
COL = "Habib_clusters_name",TYPE = "cat")ggplot_custom(DATA = df_redDim,X = "my_TSNE1",Y = "my_TSNE2",
COL = "Habib_cell_type",TYPE = "cat")saveRDS(list(sce = sce,dec = dec,df_redDim = df_redDim),
file.path(dronc_dir,"post_redDim_all_genes.rds"))We select around top 1,000 genes (based on FDR and biological residual thresholds) for the PCA and use the top 50 PCs for TSNE projection. When calculating PCs, we use log-transformed normalized gene expression data: log2(norm_express+1).
library(svd)
library(Rtsne)
summary(dec$bio)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.022095 0.000137 0.003208 0.009682 0.008864 5.261563
dec1 = dec
dec1$bio[which(dec$bio < 1e-5)] = 1e-5
dec1$FDR[which(dec$FDR < 1e-100)] = 1e-100
par(mfrow=c(1,2))
hist(log10(dec1$bio),breaks=100,main="")
hist(log10(dec1$FDR),breaks=100,main="")summary(dec$FDR[dec$bio > 0.01])## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000e+00 0.000e+00 0.000e+00 1.356e-05 0.000e+00 2.618e-02
summary(dec$FDR[dec$bio > 0.03])## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000e+00 0.000e+00 0.000e+00 5.310e-10 0.000e+00 3.248e-07
table(dec$FDR < 1e-10,dec$bio > 0.03)##
## FALSE TRUE
## FALSE 5471 6
## TRUE 4448 682
table(dec$FDR < 1e-10,dec$bio > 0.01)##
## FALSE TRUE
## FALSE 5347 130
## TRUE 2892 2238
table(dec$FDR < 1e-10,dec$bio > 0.02)##
## FALSE TRUE
## FALSE 5455 22
## TRUE 4044 1086
w2kp = which(dec$FDR < 1e-10 & dec$bio > 0.02)
sce_hvg = sce[w2kp,]
sce_hvg## class: SingleCellExperiment
## dim: 1086 13181
## metadata(1): log.exprs.offset
## assays(2): counts logcounts
## rownames(1086): AATK AC004158.3 ... ZSWIM6 ZNF704
## rowData names(20): hgnc_symbol external_gene_name ...
## n_cells_counts pct_dropout_counts
## colnames(13181): hHP1_AACACTATCTAC hHP1_CTACGCATCCAT ...
## PFC-CD_CACGCTCCCCTA PFC-CD_CGTCATTAGCAG
## colData names(66): sample_name part_cell_id ...
## pct_counts_top_50_features_Ri pct_counts_top_100_features_Ri
## reducedDimNames(2): PCA TSNE
## spikeNames(0):
edat = t(as.matrix(logcounts(sce_hvg)))
edat = scale(edat)
dim(edat)## [1] 13181 1086
edat[1:2,1:3]## AATK AC004158.3 ABLIM1
## hHP1_AACACTATCTAC -0.3259988 -0.1787744 -0.3700985
## hHP1_CTACGCATCCAT 0.1885733 -0.1787744 -0.3700985
date()## [1] "Wed Jan 30 12:01:25 2019"
ppk = propack.svd(edat,neig=50)
date()## [1] "Wed Jan 30 12:01:36 2019"
pca = t(ppk$d*t(ppk$u)) # calculates pc scores aka principal components
tmp_df = smart_df(pca[,1:2])
names(tmp_df) = paste0("HVG_PC",seq(ncol(tmp_df)))
df_hvg = smart_df(colData(sce)[,c("sample_name","part_cell_id",
paste0("Habib_TSNE",1:2),"log10_total_features","Habib_clusters",
"Habib_clusters_name","Habib_cell_type")],tmp_df)
rownames(df_hvg) = NULL
set.seed(100)
date()## [1] "Wed Jan 30 12:01:36 2019"
tsne = Rtsne(pca, pca = FALSE)
date()## [1] "Wed Jan 30 12:04:01 2019"
tmp_df = smart_df(tsne$Y)
names(tmp_df) = paste0("HVG_TSNE",seq(ncol(tmp_df)))
df_hvg = smart_df(df_hvg,tmp_df)
ggplot_custom(DATA = df_hvg,X = "HVG_TSNE1",Y = "HVG_TSNE2",
COL = "part_cell_id",TYPE = "cat")ggplot_custom(DATA = df_hvg,X = "HVG_TSNE1",Y = "HVG_TSNE2",
COL = "log10_total_features",TYPE = "cont")ggplot_custom(DATA = df_hvg,X = "HVG_TSNE1",Y = "HVG_TSNE2",
COL = "Habib_clusters",TYPE = "cat")ggplot_custom(DATA = df_hvg,X = "HVG_TSNE1",Y = "HVG_TSNE2",
COL = "Habib_clusters_name",TYPE = "cat")ggplot_custom(DATA = df_hvg,X = "HVG_TSNE1",Y = "HVG_TSNE2",
COL = "Habib_cell_type",TYPE = "cat")reducedDims(sce_hvg) = SimpleList(PCA=pca, TSNE=tsne$Y)
sce_hvg## class: SingleCellExperiment
## dim: 1086 13181
## metadata(1): log.exprs.offset
## assays(2): counts logcounts
## rownames(1086): AATK AC004158.3 ... ZSWIM6 ZNF704
## rowData names(20): hgnc_symbol external_gene_name ...
## n_cells_counts pct_dropout_counts
## colnames(13181): hHP1_AACACTATCTAC hHP1_CTACGCATCCAT ...
## PFC-CD_CACGCTCCCCTA PFC-CD_CGTCATTAGCAG
## colData names(66): sample_name part_cell_id ...
## pct_counts_top_50_features_Ri pct_counts_top_100_features_Ri
## reducedDimNames(2): PCA TSNE
## spikeNames(0):
saveRDS(sce_hvg,file.path(dronc_dir,"post_HVG.rds"))Next we cluster the cells using a simple kmeans method on the top 50 PCs. We set the number of clusters to be 5 thru 15, to include the 12 cell types reported by (Habib et al. 2017).
all_num_clust = c(5:15)
df_hvg = df_hvg[,!grepl("^KM_",names(df_hvg))]
for(num_clust in all_num_clust){
cat(paste0("KM with ",num_clust," clusters.\n"))
kmeans_out = kmeans(reducedDim(sce_hvg, "PCA"), centers = num_clust,
iter.max = 1e8, nstart = 2500,
algorithm = "MacQueen")
km_label = paste0("KM_",num_clust)
df_hvg[[km_label]] = as.factor(kmeans_out$cluster)
print(smart_table(df_hvg$Habib_cell_type,kmeans_out$cluster))
print(smart_table(df_hvg$Habib_clusters_name,kmeans_out$cluster))
print(ggplot_custom(DATA = df_hvg,X = "HVG_TSNE1",Y = "HVG_TSNE2",
COL = km_label,TYPE = "cat"))
}## KM with 5 clusters.
##
## 1 2 3 4 5
## ASC 58 0 25 1573 3
## END 118 0 1 0 1
## exCA 1113 1 7 6 0
## exDG 1418 0 3 0 0
## exPFC 3218 4 61 52 6
## GABA 1636 1 3 4 2
## MG 36 211 23 9 2
## NSC 3 153 0 5 0
## ODC 49 2 2576 6 6
## OPC 29 0 7 2 540
## <NA> 109 71 17 11 0
##
## 1 2 3 4 5
## ASC1 39 0 14 1023 2
## ASC2 19 0 11 550 1
## END 118 0 1 0 1
## exCA1 380 1 4 5 0
## exCA3 733 0 3 1 0
## exDG 1418 0 3 0 0
## exPFC1 2967 2 36 37 6
## exPFC2 251 2 25 15 0
## GABA1 868 1 3 1 1
## GABA2 768 0 0 3 1
## MG 36 211 23 9 2
## NSC 3 153 0 5 0
## ODC1 28 2 1445 5 4
## ODC2 21 0 1131 1 2
## OPC 29 0 7 2 540
## <NA> 109 71 17 11 0
## KM with 6 clusters.
##
## 1 2 3 4 5 6
## ASC 0 1573 25 58 0 3
## END 0 0 1 118 0 1
## exCA 0 7 7 1112 1 0
## exDG 0 0 3 1418 0 0
## exPFC 0 52 61 3220 2 6
## GABA 0 4 3 1636 1 2
## MG 0 7 13 30 230 1
## NSC 144 9 0 8 0 0
## ODC 0 6 2575 49 3 6
## OPC 0 2 7 29 0 540
## <NA> 1 12 17 112 66 0
##
## 1 2 3 4 5 6
## ASC1 0 1023 14 39 0 2
## ASC2 0 550 11 19 0 1
## END 0 0 1 118 0 1
## exCA1 0 6 4 379 1 0
## exCA3 0 1 3 733 0 0
## exDG 0 0 3 1418 0 0
## exPFC1 0 37 36 2969 0 6
## exPFC2 0 15 25 251 2 0
## GABA1 0 1 3 868 1 1
## GABA2 0 3 0 768 0 1
## MG 0 7 13 30 230 1
## NSC 144 9 0 8 0 0
## ODC1 0 5 1444 28 3 4
## ODC2 0 1 1131 21 0 2
## OPC 0 2 7 29 0 540
## <NA> 1 12 17 112 66 0
## KM with 7 clusters.
##
## 1 2 3 4 5 6 7
## ASC 51 25 0 7 3 1573 0
## END 115 1 0 3 1 0 0
## exCA 1103 7 1 9 0 7 0
## exDG 1417 3 0 1 0 0 0
## exPFC 3167 61 2 53 6 52 0
## GABA 295 3 1 1344 0 3 0
## MG 30 13 230 0 1 7 0
## NSC 7 0 0 0 0 10 144
## ODC 42 2574 3 8 6 6 0
## OPC 25 6 0 4 541 2 0
## <NA> 60 17 66 52 0 12 1
##
## 1 2 3 4 5 6 7
## ASC1 34 14 0 5 2 1023 0
## ASC2 17 11 0 2 1 550 0
## END 115 1 0 3 1 0 0
## exCA1 375 4 1 4 0 6 0
## exCA3 728 3 0 5 0 1 0
## exDG 1417 3 0 1 0 0 0
## exPFC1 2919 36 0 50 6 37 0
## exPFC2 248 25 2 3 0 15 0
## GABA1 278 3 1 591 0 1 0
## GABA2 17 0 0 753 0 2 0
## MG 30 13 230 0 1 7 0
## NSC 7 0 0 0 0 10 144
## ODC1 25 1444 3 3 4 5 0
## ODC2 17 1130 0 5 2 1 0
## OPC 25 6 0 4 541 2 0
## <NA> 60 17 66 52 0 12 1
## KM with 8 clusters.
##
## 1 2 3 4 5 6 7 8
## ASC 1572 25 15 3 0 42 2 0
## END 0 1 113 1 0 2 3 0
## exCA 7 7 828 0 0 274 10 1
## exDG 0 3 1402 0 0 16 0 0
## exPFC 52 59 56 5 0 3136 31 2
## GABA 3 3 15 0 0 452 1172 1
## MG 7 13 11 1 0 20 0 229
## NSC 8 0 9 0 144 0 0 0
## ODC 5 2575 14 6 0 28 8 3
## OPC 2 7 4 539 0 23 3 0
## <NA> 12 17 43 0 1 15 54 66
##
## 1 2 3 4 5 6 7 8
## ASC1 1024 14 9 2 0 27 2 0
## ASC2 548 11 6 1 0 15 0 0
## END 0 1 113 1 0 2 3 0
## exCA1 6 4 299 0 0 74 6 1
## exCA3 1 3 529 0 0 200 4 0
## exDG 0 3 1402 0 0 16 0 0
## exPFC1 37 35 53 5 0 2890 28 0
## exPFC2 15 24 3 0 0 246 3 2
## GABA1 1 3 12 0 0 425 432 1
## GABA2 2 0 3 0 0 27 740 0
## MG 7 13 11 1 0 20 0 229
## NSC 8 0 9 0 144 0 0 0
## ODC1 5 1445 7 4 0 18 2 3
## ODC2 0 1130 7 2 0 10 6 0
## OPC 2 7 4 539 0 23 3 0
## <NA> 12 17 43 0 1 15 54 66
## KM with 9 clusters.
##
## 1 2 3 4 5 6 7 8 9
## ASC 4 15 1570 23 42 2 3 0 0
## END 0 113 0 1 2 3 1 0 0
## exCA 0 825 6 9 277 9 0 1 0
## exDG 0 1402 0 3 16 0 0 0 0
## exPFC 6 56 48 59 3136 29 5 2 0
## GABA 0 15 3 3 452 1172 0 1 0
## MG 1 11 7 13 20 0 1 228 0
## NSC 0 9 8 0 0 0 0 0 144
## ODC 902 11 5 1680 25 7 6 3 0
## OPC 5 4 2 6 23 3 535 0 0
## <NA> 0 41 12 18 16 54 0 66 1
##
## 1 2 3 4 5 6 7 8 9
## ASC1 2 9 1023 13 27 2 2 0 0
## ASC2 2 6 547 10 15 0 1 0 0
## END 0 113 0 1 2 3 1 0 0
## exCA1 0 298 5 6 75 5 0 1 0
## exCA3 0 527 1 3 202 4 0 0 0
## exDG 0 1402 0 3 16 0 0 0 0
## exPFC1 3 54 35 35 2890 26 5 0 0
## exPFC2 3 2 13 24 246 3 0 2 0
## GABA1 0 12 1 3 425 432 0 1 0
## GABA2 0 3 2 0 27 740 0 0 0
## MG 1 11 7 13 20 0 1 228 0
## NSC 0 9 8 0 0 0 0 0 144
## ODC1 708 5 5 741 16 2 4 3 0
## ODC2 194 6 0 939 9 5 2 0 0
## OPC 5 4 2 6 23 3 535 0 0
## <NA> 0 41 12 18 16 54 0 66 1
## KM with 10 clusters.
##
## 1 2 3 4 5 6 7 8 9 10
## ASC 42 2 6 1570 15 0 3 0 0 21
## END 2 3 1 0 113 0 1 0 0 0
## exCA 282 11 0 6 819 0 0 0 0 9
## exDG 16 0 0 0 1402 0 0 0 0 3
## exPFC 3133 32 6 47 56 0 5 2 0 60
## GABA 452 1173 0 3 14 0 0 0 1 3
## MG 20 0 0 11 11 0 2 223 0 14
## NSC 0 0 0 8 9 144 0 0 0 0
## ODC 25 6 946 6 11 0 6 2 0 1637
## OPC 23 3 6 2 4 0 535 0 0 5
## <NA> 16 53 2 13 39 0 0 2 68 15
##
## 1 2 3 4 5 6 7 8 9 10
## ASC1 27 2 3 1023 9 0 2 0 0 12
## ASC2 15 0 3 547 6 0 1 0 0 9
## END 2 3 1 0 113 0 1 0 0 0
## exCA1 75 7 0 5 297 0 0 0 0 6
## exCA3 207 4 0 1 522 0 0 0 0 3
## exDG 16 0 0 0 1402 0 0 0 0 3
## exPFC1 2887 29 3 34 54 0 5 0 0 36
## exPFC2 246 3 3 13 2 0 0 2 0 24
## GABA1 425 433 0 1 11 0 0 0 1 3
## GABA2 27 740 0 2 3 0 0 0 0 0
## MG 20 0 0 11 11 0 2 223 0 14
## NSC 0 0 0 8 9 144 0 0 0 0
## ODC1 16 2 725 5 5 0 4 2 0 725
## ODC2 9 4 221 1 6 0 2 0 0 912
## OPC 23 3 6 2 4 0 535 0 0 5
## <NA> 16 53 2 13 39 0 0 2 68 15
## KM with 11 clusters.
##
## 1 2 3 4 5 6 7 8 9 10 11
## ASC 0 1067 20 1 0 518 4 12 36 0 1
## END 0 1 1 3 0 0 0 112 2 0 1
## exCA 0 9 8 11 0 0 0 814 285 0 0
## exDG 0 0 3 0 0 0 0 1402 16 0 0
## exPFC 0 46 54 32 2 10 6 56 3130 0 5
## GABA 0 4 3 1172 0 0 0 14 452 1 0
## MG 0 9 14 0 222 2 1 11 20 0 2
## NSC 144 7 0 0 0 2 0 8 0 0 0
## ODC 0 7 1683 6 2 0 899 11 25 0 6
## OPC 0 3 6 3 0 1 5 4 23 0 533
## <NA> 0 10 17 53 2 2 0 39 16 68 1
##
## 1 2 3 4 5 6 7 8 9 10 11
## ASC1 0 842 11 1 0 191 2 6 25 0 0
## ASC2 0 225 9 0 0 327 2 6 11 0 1
## END 0 1 1 3 0 0 0 112 2 0 1
## exCA1 0 7 5 7 0 0 0 296 75 0 0
## exCA3 0 2 3 4 0 0 0 518 210 0 0
## exDG 0 0 3 0 0 0 0 1402 16 0 0
## exPFC1 0 36 31 29 0 6 3 54 2884 0 5
## exPFC2 0 10 23 3 2 4 3 2 246 0 0
## GABA1 0 1 3 433 0 0 0 11 425 1 0
## GABA2 0 3 0 739 0 0 0 3 27 0 0
## MG 0 9 14 0 222 2 1 11 20 0 2
## NSC 144 7 0 0 0 2 0 8 0 0 0
## ODC1 0 6 744 2 2 0 705 5 16 0 4
## ODC2 0 1 939 4 0 0 194 6 9 0 2
## OPC 0 3 6 3 0 1 5 4 23 0 533
## <NA> 0 10 17 53 2 2 0 39 16 68 1
## KM with 12 clusters.
##
## 1 2 3 4 5 6 7 8 9 10 11 12
## ASC 2 1066 0 1 4 11 20 0 0 518 3 34
## END 3 1 0 1 0 105 1 0 0 0 7 2
## exCA 9 8 0 0 0 202 8 0 0 0 748 152
## exDG 0 0 0 0 0 1379 3 0 0 0 23 16
## exPFC 20 46 2 5 6 38 54 0 0 10 53 3107
## GABA 1154 4 0 0 0 4 3 0 1 0 37 443
## MG 0 9 223 1 1 9 14 0 0 2 4 18
## NSC 0 7 0 0 0 6 0 145 0 2 1 0
## ODC 5 7 2 6 899 10 1683 0 0 0 1 26
## OPC 3 3 0 533 5 5 6 0 0 1 2 20
## <NA> 46 10 2 1 0 40 17 0 68 2 5 17
##
## 1 2 3 4 5 6 7 8 9 10 11 12
## ASC1 1 841 0 0 2 6 11 0 0 191 2 24
## ASC2 1 225 0 1 2 5 9 0 0 327 1 10
## END 3 1 0 1 0 105 1 0 0 0 7 2
## exCA1 5 7 0 0 0 179 5 0 0 0 118 76
## exCA3 4 1 0 0 0 23 3 0 0 0 630 76
## exDG 0 0 0 0 0 1379 3 0 0 0 23 16
## exPFC1 17 36 0 5 3 36 31 0 0 6 49 2865
## exPFC2 3 10 2 0 3 2 23 0 0 4 4 242
## GABA1 414 1 0 0 0 3 3 0 1 0 33 419
## GABA2 740 3 0 0 0 1 0 0 0 0 4 24
## MG 0 9 223 1 1 9 14 0 0 2 4 18
## NSC 0 7 0 0 0 6 0 145 0 2 1 0
## ODC1 1 6 2 4 705 5 744 0 0 0 0 17
## ODC2 4 1 0 2 194 5 939 0 0 0 1 9
## OPC 3 3 0 533 5 5 6 0 0 1 2 20
## <NA> 46 10 2 1 0 40 17 0 68 2 5 17
## KM with 13 clusters.
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13
## ASC 4 3 0 1073 0 0 35 5 11 1 16 509 2
## END 7 0 0 1 0 0 2 0 105 1 1 0 3
## exCA 748 0 0 7 0 0 152 0 200 0 11 0 9
## exDG 23 0 0 0 0 0 15 0 1379 0 4 0 0
## exPFC 53 6 0 41 2 0 3108 11 37 5 49 10 19
## GABA 37 0 0 3 0 1 438 0 4 0 5 0 1158
## MG 4 1 0 10 221 0 16 1 10 1 16 1 0
## NSC 1 0 144 6 0 0 0 0 7 0 0 3 0
## ODC 1 688 0 5 1 0 24 719 8 6 1182 0 5
## OPC 2 5 0 2 0 0 20 0 5 534 6 1 3
## <NA> 5 0 0 10 2 68 17 3 40 0 14 3 46
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13
## ASC1 2 1 0 855 0 0 25 4 6 0 8 176 1
## ASC2 2 2 0 218 0 0 10 1 5 1 8 333 1
## END 7 0 0 1 0 0 2 0 105 1 1 0 3
## exCA1 118 0 0 6 0 0 76 0 177 0 8 0 5
## exCA3 630 0 0 1 0 0 76 0 23 0 3 0 4
## exDG 23 0 0 0 0 0 15 0 1379 0 4 0 0
## exPFC1 49 3 0 31 0 0 2866 1 35 5 36 6 16
## exPFC2 4 3 0 10 2 0 242 10 2 0 13 4 3
## GABA1 33 0 0 1 0 1 414 0 3 0 5 0 417
## GABA2 4 0 0 2 0 0 24 0 1 0 0 0 741
## MG 4 1 0 10 221 0 16 1 10 1 16 1 0
## NSC 1 0 144 6 0 0 0 0 7 0 0 3 0
## ODC1 0 623 0 4 1 0 17 172 3 4 659 0 1
## ODC2 1 65 0 1 0 0 7 547 5 2 523 0 4
## OPC 2 5 0 2 0 0 20 0 5 534 6 1 3
## <NA> 5 0 0 10 2 68 17 3 40 0 14 3 46
## KM with 14 clusters.
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13
## ASC 2 770 5 0 453 10 34 0 0 367 0 4 13
## END 7 1 0 0 0 105 2 1 0 0 0 0 1
## exCA 748 7 0 0 1 200 152 0 0 0 0 0 11
## exDG 23 0 0 0 0 1379 15 0 0 0 0 0 4
## exPFC 53 43 11 0 7 37 3107 5 2 7 0 6 44
## GABA 37 6 0 0 0 4 440 0 0 0 1 0 5
## MG 4 9 1 0 2 10 16 2 219 1 0 1 16
## NSC 1 3 0 145 1 6 0 0 0 5 0 0 0
## ODC 1 6 718 0 0 8 25 5 1 0 0 689 1181
## OPC 2 2 0 0 1 5 20 533 0 1 0 5 6
## <NA> 5 11 3 0 1 39 17 0 2 2 68 0 14
##
## 14
## ASC 1
## END 3
## exCA 8
## exDG 0
## exPFC 19
## GABA 1153
## MG 0
## NSC 0
## ODC 5
## OPC 3
## <NA> 46
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13
## ASC1 1 578 4 0 426 5 24 0 0 30 0 1 8
## ASC2 1 192 1 0 27 5 10 0 0 337 0 3 5
## END 7 1 0 0 0 105 2 1 0 0 0 0 1
## exCA1 118 6 0 0 1 177 76 0 0 0 0 0 8
## exCA3 630 1 0 0 0 23 76 0 0 0 0 0 3
## exDG 23 0 0 0 0 1379 15 0 0 0 0 0 4
## exPFC1 49 32 1 0 5 35 2865 5 0 4 0 3 33
## exPFC2 4 11 10 0 2 2 242 0 2 3 0 3 11
## GABA1 33 3 0 0 0 3 416 0 0 0 1 0 5
## GABA2 4 3 0 0 0 1 24 0 0 0 0 0 0
## MG 4 9 1 0 2 10 16 2 219 1 0 1 16
## NSC 1 3 0 145 1 6 0 0 0 5 0 0 0
## ODC1 0 5 169 0 0 3 17 3 1 0 0 625 660
## ODC2 1 1 549 0 0 5 8 2 0 0 0 64 521
## OPC 2 2 0 0 1 5 20 533 0 1 0 5 6
## <NA> 5 11 3 0 1 39 17 0 2 2 68 0 14
##
## 14
## ASC1 1
## ASC2 0
## END 3
## exCA1 4
## exCA3 4
## exDG 0
## exPFC1 16
## exPFC2 3
## GABA1 413
## GABA2 740
## MG 0
## NSC 0
## ODC1 1
## ODC2 4
## OPC 3
## <NA> 46
## KM with 15 clusters.
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13
## ASC 455 33 0 3 2 10 0 6 0 0 777 13 359
## END 0 2 0 0 8 104 0 0 0 0 1 1 0
## exCA 1 147 0 0 758 195 0 0 0 0 7 11 0
## exDG 0 15 0 0 22 1380 0 0 0 0 0 4 0
## exPFC 7 3106 1 13 53 37 0 5 0 6 44 43 7
## GABA 0 439 0 0 37 4 0 0 1 0 6 5 0
## MG 2 16 219 1 4 10 0 1 0 2 9 16 1
## NSC 1 0 0 0 1 6 145 0 0 0 3 0 5
## ODC 0 25 1 754 1 8 0 681 0 2 5 1153 0
## OPC 1 20 0 0 2 5 0 5 0 520 2 4 1
## <NA> 1 17 2 3 5 39 0 0 68 0 11 14 2
##
## 14 15
## ASC 1 0
## END 3 1
## exCA 8 0
## exDG 0 0
## exPFC 19 0
## GABA 1154 0
## MG 0 0
## NSC 0 0
## ODC 6 3
## OPC 3 15
## <NA> 46 0
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13
## ASC1 428 24 0 2 1 5 0 3 0 0 577 8 29
## ASC2 27 9 0 1 1 5 0 3 0 0 200 5 330
## END 0 2 0 0 8 104 0 0 0 0 1 1 0
## exCA1 1 73 0 0 125 173 0 0 0 0 6 8 0
## exCA3 0 74 0 0 633 22 0 0 0 0 1 3 0
## exDG 0 15 0 0 22 1380 0 0 0 0 0 4 0
## exPFC1 5 2864 0 1 49 35 0 3 0 6 32 33 4
## exPFC2 2 242 1 12 4 2 0 2 0 0 12 10 3
## GABA1 0 415 0 0 33 3 0 0 1 0 3 5 0
## GABA2 0 24 0 0 4 1 0 0 0 0 3 0 0
## MG 2 16 219 1 4 10 0 1 0 2 9 16 1
## NSC 1 0 0 0 1 6 145 0 0 0 3 0 5
## ODC1 0 17 1 200 0 3 0 613 0 1 5 641 0
## ODC2 0 8 0 554 1 5 0 68 0 1 0 512 0
## OPC 1 20 0 0 2 5 0 5 0 520 2 4 1
## <NA> 1 17 2 3 5 39 0 0 68 0 11 14 2
##
## 14 15
## ASC1 1 0
## ASC2 0 0
## END 3 1
## exCA1 4 0
## exCA3 4 0
## exDG 0 0
## exPFC1 16 0
## exPFC2 3 0
## GABA1 414 0
## GABA2 740 0
## MG 0 0
## NSC 0 0
## ODC1 1 2
## ODC2 5 1
## OPC 3 15
## <NA> 46 0
saveRDS(list(sce_hvg=sce_hvg,df_hvg=df_hvg,all_num_clust=all_num_clust),
file.path(dronc_dir,"post_kmeans.rds"))Next seek to cluster cells using another method SC3. The code used here is based on SC3 manual. By default, when there are more than 5000 genes, SC3 will
selects a subset of cells uniformly at random (5,000), and obtains clusters from this subset. The inferred labels can be used to train a Support Vector Machine (SVM), which is employed to assign labels to the remaining cells.
By default, SC3 filter genes to select those with dropout percentage between 10 and 90%.
summary(rowData(sce)$pct_dropout_counts)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 10.34 93.16 96.08 94.26 97.71 99.70
table(rowData(sce)$pct_dropout_counts < 90)##
## FALSE TRUE
## 9189 1418
This will end up with 1418 genes. However, we found the clustering resutls using these 1418 genes have considerable discrepency with clustering resutls from Kmeans and cell types reported by (Habib et al. 2017). Therefore, we chose to use those genes identified from previous step for SC3. Following the recommendation for runing SC3, we first clustering 2000 cells and then run SVM and predict labels of all other cells.
library(SC3)
rowData(sce_hvg)$feature_symbol = rowData(sce_hvg)$external_gene_name
date()## [1] "Wed Jan 30 13:02:34 2019"
all_ks = c(10,12)
sce_hvg = sc3(sce_hvg,gene_filter = FALSE,
n_cores = num_cores,ks = all_ks,biology = TRUE,
rand_seed = 100,svm_num_cells = 2000)## Setting SC3 parameters...
## Your dataset contains more than 2000 cells. Adjusting the nstart parameter of kmeans to 50 for faster performance...
## Defining training cells for SVM using svm_num_cells parameter...
## Calculating distances between the cells...
## Performing transformations and calculating eigenvectors...
## Performing k-means clustering...
## Calculating consensus matrix...
## Calculating biology...
date()## [1] "Wed Jan 30 14:02:48 2019"
dim(colData(sce_hvg))## [1] 13181 70
colData(sce_hvg)[1:2,1:5]## DataFrame with 2 rows and 5 columns
## sample_name part_cell_id Habib_clusters
## <character> <character> <factor>
## hHP1_AACACTATCTAC hHP1_AACACTATCTAC hHP1 4
## hHP1_CTACGCATCCAT hHP1_CTACGCATCCAT hHP1 3
## Habib_clusters_name Habib_cell_type
## <character> <character>
## hHP1_AACACTATCTAC exCA3 exCA
## hHP1_CTACGCATCCAT exCA1 exCA
names(colData(sce_hvg))## [1] "sample_name"
## [2] "part_cell_id"
## [3] "Habib_clusters"
## [4] "Habib_clusters_name"
## [5] "Habib_cell_type"
## [6] "Habib_TSNE1"
## [7] "Habib_TSNE2"
## [8] "is_cell_control"
## [9] "total_features_by_counts"
## [10] "log10_total_features_by_counts"
## [11] "total_counts"
## [12] "log10_total_counts"
## [13] "pct_counts_in_top_50_features"
## [14] "pct_counts_in_top_100_features"
## [15] "pct_counts_in_top_200_features"
## [16] "pct_counts_in_top_500_features"
## [17] "total_features"
## [18] "log10_total_features"
## [19] "pct_counts_top_50_features"
## [20] "pct_counts_top_100_features"
## [21] "pct_counts_top_200_features"
## [22] "pct_counts_top_500_features"
## [23] "total_features_by_counts_endogenous"
## [24] "log10_total_features_by_counts_endogenous"
## [25] "total_counts_endogenous"
## [26] "log10_total_counts_endogenous"
## [27] "pct_counts_endogenous"
## [28] "pct_counts_in_top_50_features_endogenous"
## [29] "pct_counts_in_top_100_features_endogenous"
## [30] "pct_counts_in_top_200_features_endogenous"
## [31] "pct_counts_in_top_500_features_endogenous"
## [32] "total_features_endogenous"
## [33] "log10_total_features_endogenous"
## [34] "pct_counts_top_50_features_endogenous"
## [35] "pct_counts_top_100_features_endogenous"
## [36] "pct_counts_top_200_features_endogenous"
## [37] "pct_counts_top_500_features_endogenous"
## [38] "total_features_by_counts_feature_control"
## [39] "log10_total_features_by_counts_feature_control"
## [40] "total_counts_feature_control"
## [41] "log10_total_counts_feature_control"
## [42] "pct_counts_feature_control"
## [43] "pct_counts_in_top_50_features_feature_control"
## [44] "pct_counts_in_top_100_features_feature_control"
## [45] "total_features_feature_control"
## [46] "log10_total_features_feature_control"
## [47] "pct_counts_top_50_features_feature_control"
## [48] "pct_counts_top_100_features_feature_control"
## [49] "total_features_by_counts_Mt"
## [50] "log10_total_features_by_counts_Mt"
## [51] "total_counts_Mt"
## [52] "log10_total_counts_Mt"
## [53] "pct_counts_Mt"
## [54] "total_features_Mt"
## [55] "log10_total_features_Mt"
## [56] "total_features_by_counts_Ri"
## [57] "log10_total_features_by_counts_Ri"
## [58] "total_counts_Ri"
## [59] "log10_total_counts_Ri"
## [60] "pct_counts_Ri"
## [61] "pct_counts_in_top_50_features_Ri"
## [62] "pct_counts_in_top_100_features_Ri"
## [63] "total_features_Ri"
## [64] "log10_total_features_Ri"
## [65] "pct_counts_top_50_features_Ri"
## [66] "pct_counts_top_100_features_Ri"
## [67] "sc3_10_clusters"
## [68] "sc3_12_clusters"
## [69] "sc3_10_log2_outlier_score"
## [70] "sc3_12_log2_outlier_score"
date()## [1] "Wed Jan 30 14:02:48 2019"
sce_hvg = sc3_run_svm(sce_hvg, ks = all_ks)
date()## [1] "Wed Jan 30 14:03:53 2019"
Next we compare the clustering results from SC3 and the reported cell types.
for(one_ks in all_ks){
sc3_label = paste0("sc3_",one_ks,"_clusters")
df_hvg[[sc3_label]] = as.factor(colData(sce_hvg)[,sc3_label])
print(smart_table(df_hvg$Habib_cell_type, df_hvg[[sc3_label]]))
print(ggplot_custom(DATA = df_hvg,X = "HVG_TSNE1",Y = "HVG_TSNE2",
COL = sc3_label,TYPE = "cat"))
}##
## 1 2 3 4 5 6 7 8 9 10
## ASC 23 242 1323 4 4 11 15 37 0 0
## END 1 7 0 1 0 16 86 6 0 3
## exCA 9 149 6 2 2 25 128 758 1 47
## exDG 4 7 0 0 0 208 1165 34 0 3
## exPFC 63 25 41 12 124 772 373 1908 3 20
## GABA 4 2 3 13 40 233 97 599 114 541
## MG 23 2 38 183 0 3 9 23 0 0
## NSC 1 2 33 115 0 1 7 2 0 0
## ODC 2581 2 7 4 1 12 8 23 0 1
## OPC 11 1 97 442 1 4 5 16 0 1
## <NA> 18 3 19 73 39 7 11 16 5 17
##
## 1 2 3 4 5 6 7 8 9 10 11 12
## ASC 34 1 2 19 2 242 5 14 1323 1 11 5
## END 2 0 0 3 19 0 11 84 1 0 0 0
## exCA 256 2 0 125 15 0 683 26 5 0 15 0
## exDG 27 0 1 6 207 0 16 1162 0 0 2 0
## exPFC 2223 10 12 51 18 9 33 154 44 2 770 15
## GABA 596 132 0 8 644 0 19 33 2 16 196 0
## MG 23 178 9 12 0 2 4 5 10 30 6 2
## NSC 1 120 1 0 0 2 2 1 7 27 0 0
## ODC 19 3 736 1440 2 1 2 8 8 1 168 251
## OPC 15 0 1 7 1 0 2 5 96 0 2 449
## <NA> 17 66 5 9 20 0 14 6 13 47 6 5
saveRDS(list(sce_hvg=sce_hvg,df_hvg=df_hvg,all_ks=all_ks),
file.path(dronc_dir,"post_sc3.rds"))We obtainted the cell type and clustering resutls from (Habib et al. 2017). Supplementary Table 10: supplement nmeth.4407-S10.xlsx file. Here we compare the cell type reported by Habib et al. (2017) with ours. Habib et al. (2017) identified genes with high variation by fitting a gamma distribution on the relation between mean and coefficient of variation and chose the number of PCs based on “the largest eigen value gap”. It was not clear what is the number of PCs used. Then they used these top PCs to perform tSNE anlaysis and clustering using a graph based method.
tmp_lab = smart_RT(file.path(work_dir,"cluster_num_label.txt"),
sep = "\t",header = TRUE)
tmp_lab## Cell.ID Name Cell.Type.ID Name.1
## 1 1 exPFC1 1 exPFC
## 2 2 exPFC2 1 exPFC
## 3 3 exCA1 2 exCA1
## 4 4 exCA3 3 exCA3
## 5 5 GABA1 4 GABA
## 6 6 GABA2 4 GABA
## 7 7 exDG 5 exDG
## 8 8 ASC1 6 ASC
## 9 9 ASC2 6 ASC
## 10 10 ODC1 7 ODC
## 11 11 ODC2 7 ODC
## 12 12 OPC 8 OPC
## 13 13 MG 9 MG
## 14 14 NSC 10 NSC
## 15 15 END 11 END
tmp_lab = name_change(tmp_lab,"Name","Cluster.Name")
tmp_lab = name_change(tmp_lab,"Name.1","Cell_Type")
tmp_res = smart_RT(file.path(work_dir,"paper_cluster_res.txt"),
sep = "\t",header = TRUE,comment.char = "")
dim(tmp_res); tmp_res[1:5,]## [1] 14963 5
## Cell.ID X.Genes X.Transcripts Cluster.ID Cluster.Name
## 1 hHP1_AGACCGCGACTA 2289 3946 1 exPFC1
## 2 hHP1_AAATCGTAGTAG 1311 2077 1 exPFC1
## 3 hHP1_TCACCAGGCGAT 676 978 1 exPFC1
## 4 hHP1_GCATACAGTGAA 916 1425 1 exPFC1
## 5 hHP1_CCCCCTCAGTAC 481 641 1 exPFC1
tmp_res = name_change(tmp_res,"X.Genes","nGenes")
tmp_res = name_change(tmp_res,"X.Transcripts","nTranscripts")
tmp_res = smart_merge(tmp_res, tmp_lab[,c("Cluster.Name","Cell_Type")],
all.x=TRUE)
summary(tmp_res$nGenes)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 201.0 337.0 529.0 658.4 826.0 4249.0
smart_hist(tmp_res$nGenes,breaks=50,main="",xlab="number of genes per cell")df_hvg$Cell.ID = colnames(sce_hvg)
smart_table(df_hvg$Cell.ID %in% tmp_res$Cell.ID)##
## TRUE
## 13181
tmp_res$Cell.ID[!(tmp_res$Cell.ID %in% df_hvg$Cell.ID)][1:5]## [1] "PFC2-A2_GTTGAGGAGCGT" "PFC2-A2_CAAATGCCTGTC" "PFC2-A2_GGTTTGGCCCTN"
## [4] "PFC2-A2_ACCAGGCTTCGA" "PFC2-A1_TCCGTTCTTCGT"
tmp_res$Cell.ID = gsub("-",".",tmp_res$Cell.ID)
table(df_hvg$Cell.ID %in% tmp_res$Cell.ID)##
## FALSE TRUE
## 7812 5369
# Merge and compare
all_clust_res = smart_merge(df_hvg,tmp_res)
sc3_res = smart_df(colData(sce_hvg)[,paste0("sc3_",all_ks,"_clusters")])
for(ks in all_ks){
sc3_label = paste0("sc3_",ks,"_clusters")
sc3_res[,sc3_label] = as.factor(sc3_res[,sc3_label])
}
sc3_res$Cell.ID = colnames(sce_hvg)
all_clust_res = smart_merge(all_clust_res, sc3_res)
dim(all_clust_res)## [1] 5369 31
all_clust_res[1:5,]## Cell.ID sc3_10_clusters sc3_12_clusters sample_name
## 1 hCc_AAAAAGCTACAA 8 1 hCc_AAAAAGCTACAA
## 2 hCc_AAACAGGTGAGG 7 1 hCc_AAACAGGTGAGG
## 3 hCc_AAACCCTTTACA 8 1 hCc_AAACCCTTTACA
## 4 hCc_AAACGTGACGGA 8 1 hCc_AAACGTGACGGA
## 5 hCc_AAAGAACTCGCG 7 1 hCc_AAAGAACTCGCG
## part_cell_id Habib_TSNE1 Habib_TSNE2 log10_total_features Habib_clusters
## 1 hCc 2.804881 -1.888592 2.866287 1
## 2 hCc 16.965708 -6.768078 3.170262 5
## 3 hCc 17.828567 -2.514648 3.044540 5
## 4 hCc 14.756007 -4.965194 2.849419 5
## 5 hCc -1.552130 3.860966 2.716838 1
## Habib_clusters_name Habib_cell_type HVG_PC1 HVG_PC2 HVG_TSNE1
## 1 exPFC1 exPFC -3.809914 -1.148090 -2.054991
## 2 GABA1 GABA -3.041883 -1.756074 -9.557901
## 3 GABA1 GABA -4.335472 -1.149693 -8.302459
## 4 GABA1 GABA -3.614090 -1.958961 -7.234844
## 5 exPFC1 exPFC -4.001498 -2.066558 8.784212
## HVG_TSNE2 KM_5 KM_6 KM_7 KM_8 KM_9 KM_10 KM_11 KM_12 KM_13 KM_14 KM_15
## 1 6.697855 1 4 1 6 5 1 9 12 7 7 2
## 2 20.722796 1 4 4 7 6 2 4 1 13 14 14
## 3 18.022212 1 4 1 6 5 1 9 12 7 7 2
## 4 15.676638 1 4 1 6 5 1 9 12 7 7 2
## 5 10.251190 1 4 1 6 5 1 9 12 7 7 2
## Cluster.Name nGenes nTranscripts Cluster.ID Cell_Type
## 1 exPFC1 736 1034 1 exPFC
## 2 GABA1 1480 2391 5 GABA
## 3 GABA1 1107 1757 5 GABA
## 4 GABA1 708 935 5 GABA
## 5 exPFC1 520 673 1 exPFC
smart_table(all_clust_res[,c("Cell_Type","Cluster.ID")])## Cluster.ID
## Cell_Type 1 2 3 4 5 6 7 8 9 10 11 12 13
## ASC 0 0 0 0 0 0 0 380 82 0 0 0 0
## END 0 0 0 0 0 0 0 0 0 0 0 0 0
## exCA1 0 0 90 0 0 0 0 0 0 0 0 0 0
## exCA3 0 0 0 200 0 0 0 0 0 0 0 0 0
## exDG 0 0 0 0 0 0 64 0 0 0 0 0 0
## exPFC 2479 262 0 0 0 0 0 0 0 0 0 0 0
## GABA 0 0 0 0 629 469 0 0 0 0 0 0 0
## MG 0 0 0 0 0 0 0 0 0 0 0 0 60
## ODC 0 0 0 0 0 0 0 0 0 192 190 0 0
## OPC 0 0 0 0 0 0 0 0 0 0 0 181 0
## <NA> 0 0 0 0 0 0 0 0 0 0 0 0 0
## Cluster.ID
## Cell_Type 15 16 18
## ASC 0 0 0
## END 18 0 0
## exCA1 0 0 0
## exCA3 0 0 0
## exDG 0 0 0
## exPFC 0 0 0
## GABA 0 0 0
## MG 0 0 0
## ODC 0 0 0
## OPC 0 0 0
## <NA> 0 50 23
for(num_clust in all_num_clust){
km_label = paste0("KM_",num_clust)
print(smart_table(all_clust_res[,c("Cell_Type",km_label)]))
t2 = melt(smart_table(all_clust_res[,c("Cell_Type",km_label)]))
colnames(t2)[2] = "cluster"
print(summary(t2$value))
print(gg.heatmap(t2,paste0("kmeans ",num_clust," clusters")))
}## KM_5
## Cell_Type 1 2 3 4 5
## ASC 39 0 9 413 1
## END 4 14 0 0 0
## exCA1 88 0 1 1 0
## exCA3 200 0 0 0 0
## exDG 64 0 0 0 0
## exPFC 2690 1 29 17 4
## GABA 1093 0 1 3 1
## MG 18 30 7 5 0
## ODC 35 1 343 2 1
## OPC 22 0 1 0 158
## <NA> 64 1 6 2 0
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 0.00 1.00 97.62 20.00 2690.00
## KM_6
## Cell_Type 1 2 3 4 5 6
## ASC 0 413 9 39 0 1
## END 0 0 0 5 13 0
## exCA1 0 1 1 88 0 0
## exCA3 0 0 0 200 0 0
## exDG 0 0 0 64 0 0
## exPFC 0 17 29 2690 1 4
## GABA 0 3 1 1093 0 1
## MG 0 4 4 15 37 0
## ODC 0 2 343 35 1 1
## OPC 0 0 1 22 0 158
## <NA> 0 2 5 64 2 0
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 0.00 1.00 81.35 12.00 2690.00
## KM_7
## Cell_Type 1 2 3 4 5 6 7
## ASC 32 9 0 7 1 413 0
## END 5 0 13 0 0 0 0
## exCA1 87 1 0 1 0 1 0
## exCA3 197 0 0 3 0 0 0
## exDG 63 0 0 1 0 0 0
## exPFC 2648 29 1 42 4 17 0
## GABA 226 1 0 869 0 2 0
## MG 15 4 37 0 0 4 0
## ODC 28 343 1 7 1 2 0
## OPC 20 1 0 2 158 0 0
## <NA> 29 5 2 35 0 2 0
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 0.00 1.00 69.73 13.00 2648.00
## KM_8
## Cell_Type 1 2 3 4 5 6 7 8
## ASC 413 9 3 1 0 34 2 0
## END 0 0 1 0 0 4 0 13
## exCA1 1 1 33 0 0 53 2 0
## exCA3 0 0 81 0 0 117 2 0
## exDG 0 0 58 0 0 6 0 0
## exPFC 17 27 27 4 0 2642 23 1
## GABA 2 1 3 0 0 366 726 0
## MG 4 4 1 0 0 14 0 37
## ODC 2 344 2 1 0 26 6 1
## OPC 0 1 1 157 0 20 2 0
## <NA> 2 5 20 0 0 9 35 2
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 0.00 1.00 61.01 13.25 2642.00
## KM_9
## Cell_Type 1 2 3 4 5 6 7 8 9
## ASC 0 3 413 9 34 2 1 0 0
## END 0 1 0 0 4 0 0 13 0
## exCA1 0 33 1 1 53 2 0 0 0
## exCA3 0 81 0 0 117 2 0 0 0
## exDG 0 58 0 0 6 0 0 0 0
## exPFC 0 27 16 30 2641 22 4 1 0
## GABA 0 3 2 1 366 726 0 0 0
## MG 0 1 4 4 14 0 0 37 0
## ODC 34 2 2 313 23 6 1 1 0
## OPC 0 1 0 1 20 2 157 0 0
## <NA> 0 19 2 6 9 35 0 2 0
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 0.00 1.00 54.23 9.00 2641.00
## KM_10
## Cell_Type 1 2 3 4 5 6 7 8 9 10
## ASC 34 2 1 413 3 0 1 0 0 8
## END 4 0 0 0 1 0 0 0 13 0
## exCA1 54 2 0 1 32 0 0 0 0 1
## exCA3 118 2 0 0 80 0 0 0 0 0
## exDG 6 0 0 0 58 0 0 0 0 0
## exPFC 2639 24 0 15 27 0 4 1 0 31
## GABA 366 726 0 2 3 0 0 0 0 1
## MG 14 0 0 5 1 0 0 35 0 5
## ODC 23 6 37 2 2 0 1 1 0 310
## OPC 20 2 0 0 1 0 157 0 0 1
## <NA> 9 35 0 2 19 0 0 1 0 7
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 0.00 1.00 48.81 6.00 2639.00
## KM_11
## Cell_Type 1 2 3 4 5 6 7 8 9 10 11
## ASC 0 386 9 1 0 35 0 1 30 0 0
## END 0 0 0 0 0 0 0 1 4 13 0
## exCA1 0 1 1 2 0 0 0 32 54 0 0
## exCA3 0 0 0 2 0 0 0 78 120 0 0
## exDG 0 0 0 0 0 0 0 58 6 0 0
## exPFC 0 19 27 24 1 1 0 27 2638 0 4
## GABA 0 3 1 725 0 0 0 3 366 0 0
## MG 0 5 5 0 35 0 0 1 14 0 0
## ODC 0 2 313 6 1 0 34 2 23 0 1
## OPC 0 2 1 2 0 0 0 1 20 0 155
## <NA> 0 2 7 35 1 0 0 19 9 0 0
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 0.00 0.00 44.37 5.00 2638.00
## KM_12
## Cell_Type 1 2 3 4 5 6 7 8 9 10 11 12
## ASC 2 386 0 0 0 1 9 0 0 35 1 28
## END 0 0 0 0 0 1 0 0 13 0 0 4
## exCA1 1 1 0 0 0 12 1 0 0 0 31 44
## exCA3 2 0 0 0 0 7 0 0 0 0 142 49
## exDG 0 0 0 0 0 53 0 0 0 0 5 6
## exPFC 14 19 1 4 0 16 27 0 0 1 38 2621
## GABA 710 3 0 0 0 0 1 0 0 0 11 373
## MG 0 5 35 0 0 1 5 0 0 0 1 13
## ODC 5 2 1 1 34 2 313 0 0 0 0 24
## OPC 2 2 0 155 0 2 1 0 0 0 2 17
## <NA> 31 2 1 0 0 21 7 0 0 0 1 10
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 0.00 0.00 40.67 5.25 2621.00
## KM_13
## Cell_Type 1 2 3 4 5 6 7 8 9 10 11 12 13
## ASC 2 1 0 386 0 0 29 1 1 0 7 33 2
## END 0 0 0 0 0 13 4 0 1 0 0 0 0
## exCA1 31 0 0 1 0 0 44 0 12 0 1 0 1
## exCA3 142 0 0 0 0 0 49 0 7 0 0 0 2
## exDG 5 0 0 0 0 0 6 0 53 0 0 0 0
## exPFC 38 0 0 15 1 0 2622 6 16 4 25 1 13
## GABA 11 0 0 2 0 0 369 0 0 0 2 0 714
## MG 1 0 0 5 34 0 11 0 1 0 8 0 0
## ODC 0 22 0 1 1 0 23 68 2 1 259 0 5
## OPC 2 0 0 1 0 0 17 0 2 156 1 0 2
## <NA> 1 0 0 2 1 0 10 0 21 0 7 0 31
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 0.00 0.00 37.55 5.50 2622.00
## KM_14
## Cell_Type 1 2 3 4 5 6 7 8 9 10 11 12 13
## ASC 0 331 1 0 71 1 28 0 0 21 0 1 7
## END 0 0 0 0 0 1 4 0 0 0 13 0 0
## exCA1 31 1 0 0 0 12 44 0 0 0 0 0 1
## exCA3 142 0 0 0 0 7 49 0 0 0 0 0 0
## exDG 5 0 0 0 0 53 6 0 0 0 0 0 0
## exPFC 38 19 6 0 0 16 2621 4 1 1 0 0 22
## GABA 11 5 0 0 0 0 370 0 0 0 0 0 2
## MG 1 6 0 0 0 1 11 0 33 0 0 0 8
## ODC 0 2 68 0 0 2 23 0 1 0 0 22 259
## OPC 2 2 0 0 0 2 17 155 0 0 0 0 1
## <NA> 1 2 0 0 0 21 10 0 1 0 0 0 7
## KM_14
## Cell_Type 14
## ASC 1
## END 0
## exCA1 1
## exCA3 2
## exDG 0
## exPFC 13
## GABA 710
## MG 0
## ODC 5
## OPC 2
## <NA> 31
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 0.00 0.00 34.86 5.00 2621.00
## KM_15
## Cell_Type 1 2 3 4 5 6 7 8 9 10 11 12 13
## ASC 71 27 0 1 0 1 0 1 0 0 335 7 18
## END 0 4 0 0 0 1 0 0 13 0 0 0 0
## exCA1 0 42 0 0 33 12 0 0 0 0 1 1 0
## exCA3 0 48 0 0 143 7 0 0 0 0 0 0 0
## exDG 0 6 0 0 5 53 0 0 0 0 0 0 0
## exPFC 0 2621 1 6 38 16 0 0 0 4 19 22 1
## GABA 0 370 0 0 11 0 0 0 0 0 5 2 0
## MG 0 11 33 0 1 1 0 0 0 0 6 8 0
## ODC 0 23 1 73 0 2 0 22 0 0 2 254 0
## OPC 0 17 0 0 2 2 0 0 0 155 2 1 0
## <NA> 0 10 1 0 1 21 0 0 0 0 2 7 0
## KM_15
## Cell_Type 14 15
## ASC 1 0
## END 0 0
## exCA1 1 0
## exCA3 2 0
## exDG 0 0
## exPFC 13 0
## GABA 710 0
## MG 0 0
## ODC 5 0
## OPC 2 0
## <NA> 31 0
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 0.00 0.00 32.54 4.00 2621.00
for(ks in all_ks){
sc3_label = paste0("sc3_",ks,"_clusters")
print(smart_table(all_clust_res[,c("Cell_Type",sc3_label)]))
t2 = melt(smart_table(all_clust_res[,c("Cell_Type",sc3_label)]))
colnames(t2)[2] = "cluster"
print(summary(t2$value))
print(gg.heatmap(t2,paste0("sc3 ",ks," clusters")))
}## sc3_10_clusters
## Cell_Type 1 2 3 4 5 6 7 8 9 10
## ASC 8 57 355 1 4 10 6 21 0 0
## END 0 0 2 12 0 0 2 1 0 1
## exCA1 2 4 0 0 1 10 9 63 0 1
## exCA3 0 8 0 0 1 8 23 153 0 7
## exDG 0 0 0 0 0 11 48 5 0 0
## exPFC 27 9 16 4 124 771 291 1485 2 12
## GABA 1 1 2 11 40 233 67 414 49 280
## MG 7 0 9 27 0 3 1 13 0 0
## ODC 347 0 1 1 1 12 2 17 0 1
## OPC 2 1 32 126 1 4 2 12 0 1
## <NA> 7 0 2 9 27 6 1 9 4 8
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 0.00 2.50 48.81 12.00 1485.00
## sc3_12_clusters
## Cell_Type 1 2 3 4 5 6 7 8 9 10 11 12
## ASC 22 1 1 6 1 57 1 6 354 1 11 1
## END 3 11 0 0 1 0 0 0 0 2 0 1
## exCA1 51 0 0 6 1 0 21 2 0 0 9 0
## exCA3 92 0 0 17 2 0 81 2 0 0 6 0
## exDG 5 0 0 0 9 0 0 48 0 0 2 0
## exPFC 1735 5 2 25 14 3 16 143 18 1 770 9
## GABA 424 60 0 2 348 0 18 33 1 16 196 0
## MG 11 26 3 4 0 0 2 0 5 5 4 0
## ODC 17 2 63 242 2 0 0 2 1 1 19 33
## OPC 12 0 0 2 1 0 1 2 32 0 2 129
## <NA> 9 6 0 3 10 0 8 1 3 27 5 1
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 0.00 2.00 40.67 11.25 1735.00
We plot our TSNE colored by our clustering results.
all_vars = c("Cell_Type",paste0("KM_",all_num_clust),
paste0("sc3_",all_ks,"_clusters"))
for(one_var in all_vars){
print(ggplot_custom(DATA = all_clust_res,X = "HVG_TSNE1",
Y = "HVG_TSNE2",COL = one_var,TYPE = "cat"))
}Next we plot the TSNE reported by (Habib et al. 2017), colored by our clustering results.
all_vars = c("Cell_Type",paste0("KM_",all_num_clust),
paste0("sc3_",all_ks,"_clusters"))
for(one_var in all_vars){
print(ggplot_custom(DATA = all_clust_res,X = "Habib_TSNE1",
Y = "Habib_TSNE2",COL = one_var,TYPE = "cat"))
}Finally we save the sce object and the clustering results
saveRDS(sce,file.path(dronc_dir,"sce.rds"))
saveRDS(sce_hvg,file.path(dronc_dir,"sce_hvg.rds"))
saveRDS(all_clust_res,file.path(dronc_dir,"all_clust_res.rds"))sessionInfo()## R version 3.5.0 (2018-04-23)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS 10.14.2
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] SC3_1.8.0 Rtsne_0.13
## [3] svd_0.4.1 limma_3.36.5
## [5] scran_1.8.4 scater_1.8.4
## [7] ggplot2_3.0.0 biomaRt_2.36.1
## [9] DropletUtils_1.0.3 SingleCellExperiment_1.2.0
## [11] SummarizedExperiment_1.10.1 DelayedArray_0.6.4
## [13] BiocParallel_1.14.2 matrixStats_0.54.0
## [15] Biobase_2.40.0 GenomicRanges_1.32.6
## [17] GenomeInfoDb_1.16.0 IRanges_2.14.10
## [19] S4Vectors_0.18.3 BiocGenerics_0.26.0
## [21] data.table_1.12.0 BiocInstaller_1.30.0
##
## loaded via a namespace (and not attached):
## [1] ggbeeswarm_0.6.0 colorspace_1.3-2
## [3] rjson_0.2.20 class_7.3-14
## [5] dynamicTreeCut_1.63-1 rprojroot_1.3-2
## [7] XVector_0.20.0 DT_0.4
## [9] bit64_0.9-7 mvtnorm_1.0-8
## [11] AnnotationDbi_1.42.1 codetools_0.2-15
## [13] tximport_1.8.0 doParallel_1.0.11
## [15] robustbase_0.93-2 knitr_1.20
## [17] cluster_2.0.7-1 pheatmap_1.0.10
## [19] shinydashboard_0.7.0 shiny_1.1.0
## [21] rrcov_1.4-4 compiler_3.5.0
## [23] httr_1.3.1 backports_1.1.2
## [25] assertthat_0.2.0 Matrix_1.2-14
## [27] lazyeval_0.2.1 later_0.7.3
## [29] htmltools_0.3.6 prettyunits_1.0.2
## [31] tools_3.5.0 bindrcpp_0.2.2
## [33] igraph_1.2.2 gtable_0.2.0
## [35] glue_1.3.0 GenomeInfoDbData_1.1.0
## [37] reshape2_1.4.3 dplyr_0.7.6
## [39] doRNG_1.7.1 Rcpp_0.12.18
## [41] gdata_2.18.0 iterators_1.0.10
## [43] DelayedMatrixStats_1.2.0 stringr_1.3.1
## [45] mime_0.5 irlba_2.3.2
## [47] rngtools_1.3.1 gtools_3.8.1
## [49] WriteXLS_4.0.0 statmod_1.4.30
## [51] XML_3.98-1.15 DEoptimR_1.0-8
## [53] edgeR_3.22.3 zlibbioc_1.26.0
## [55] scales_1.0.0 hms_0.4.2
## [57] promises_1.0.1 rhdf5_2.24.0
## [59] RColorBrewer_1.1-2 yaml_2.2.0
## [61] memoise_1.1.0 gridExtra_2.3
## [63] pkgmaker_0.27 stringi_1.2.4
## [65] RSQLite_2.1.1 pcaPP_1.9-73
## [67] foreach_1.4.4 e1071_1.7-0
## [69] caTools_1.17.1.1 bibtex_0.4.2
## [71] rlang_0.2.1 pkgconfig_2.0.1
## [73] bitops_1.0-6 evaluate_0.11
## [75] lattice_0.20-35 ROCR_1.0-7
## [77] purrr_0.2.5 Rhdf5lib_1.2.1
## [79] bindr_0.1.1 htmlwidgets_1.2
## [81] labeling_0.3 cowplot_0.9.3
## [83] bit_1.1-14 tidyselect_0.2.4
## [85] plyr_1.8.4 magrittr_1.5
## [87] R6_2.2.2 gplots_3.0.1
## [89] DBI_1.0.0 pillar_1.3.0
## [91] withr_2.1.2 RCurl_1.95-4.11
## [93] tibble_1.4.2 crayon_1.3.4
## [95] KernSmooth_2.23-15 rmarkdown_1.10
## [97] viridis_0.5.1 progress_1.2.0
## [99] locfit_1.5-9.1 grid_3.5.0
## [101] blob_1.1.1 FNN_1.1.2.1
## [103] digest_0.6.15 xtable_1.8-2
## [105] httpuv_1.4.5 munsell_0.5.0
## [107] registry_0.5 beeswarm_0.2.3
## [109] viridisLite_0.3.0 vipor_0.4.5
Habib, Naomi, Inbal Avraham-Davidi, Anindita Basu, Tyler Burks, Karthik Shekhar, Matan Hofree, Sourav R Choudhury, et al. 2017. “Massively Parallel Single-Nucleus Rna-Seq with Dronc-Seq.” Nature Methods 14 (10). Nature Publishing Group: 955.
Lun, Aaron TL, Karsten Bach, and John C Marioni. 2016. “Pooling Across Cells to Normalize Single-Cell Rna Sequencing Data with Many Zero Counts.” Genome Biology 17 (1). BioMed Central: 75.